Packages

library(tidyverse)
library(readxl)
library(sf)
library(leaflet)

Read geocoded entries data

# Load the geocoded entries workbook; the first row supplies column names.
geocoded_entries <- readxl::read_excel(
  "geocoded_entries.xlsx",
  col_names = TRUE
)

# Preview the first rows of the imported table.
geocoded_entries %>% head()

Geocoded information

What proportion of the addresses detected by NER are precisely geocoded?

# Share of rows with and without a precise geometry.
# A row is flagged TRUE when `precise.geom` is not NA.
geocoded_entries %>%
  select(precise.geom, entry_id) %>%
  mutate(geometry = !is.na(precise.geom)) %>%
  count(geometry) %>%
  mutate(freq = n / sum(n)) %>%
  ggplot(aes(x = geometry, y = freq)) +
  geom_col() + # bar height is the precomputed frequency
  theme_bw() +
  labs(
    title = "<LOC> and <CARD> elements precisely geocoded from the entries",
    caption = "ANR SoDUCo. Data: GeoHistoricalData"
  )

What is the proportion of entries which are precisely geocoded?

# Entries that appear on exactly one row: count and share of rows
# with ("VRAI") and without ("FAUX") a precise geometry.
order1 <- geocoded_entries %>%
  select(precise.geom, entry_id, order) %>%
  mutate(geometry = if_else(!is.na(precise.geom), "VRAI", "FAUX")) %>%
  # number of rows sharing the same entry_id
  add_count(entry_id, name = "order_n") %>%
  filter(order_n == 1) %>%
  count(geometry) %>%
  mutate(
    freq = n / sum(n),
    order_n = 1
  ) %>%
  select(order_n, geometry, n, freq)

# Entries that appear on several rows: for each number of rows per entry
# (`order_n`), count and share of rows with ("VRAI") and without ("FAUX")
# a precise geometry. The result stays grouped by `order_n`.
notorder1 <- geocoded_entries %>%
  select(precise.geom, entry_id, order) %>%
  mutate(geometry = if_else(!is.na(precise.geom), "VRAI", "FAUX")) %>%
  # number of rows sharing the same entry_id
  add_count(entry_id, name = "order_n") %>%
  filter(order_n > 1) %>%
  group_by(order_n) %>%
  # explicit sums keep a zero-count row when a category is absent
  summarise(
    FAUX = sum(geometry == "FAUX"),
    VRAI = sum(geometry == "VRAI")
  ) %>%
  pivot_longer(
    cols = c(FAUX, VRAI),
    names_to = "geometry",
    values_to = "n"
  ) %>%
  group_by(order_n) %>%
  mutate(freq = n / sum(n))

# Very few entries have more than 8 localisations: print how many rows
# they represent before leaving them out of the chart.
order1 %>%
  bind_rows(notorder1) %>%
  filter(order_n > 8) %>%
  summarise(countelements = sum(n))

# Share of precisely geocoded rows, split by the number of localisations
# per entry. The filter is `<= 8` so that it complements the `> 8` count
# above; the previous `< 8` silently dropped entries with exactly 8
# localisations from both the count and the chart.
order1 %>%
  bind_rows(notorder1) %>%
  filter(order_n <= 8) %>%
  ggplot(aes(x = geometry, y = freq, fill = as.character(order_n))) +
  geom_col(position = "dodge") + # bar height is the precomputed frequency
  theme_bw() +
  ggtitle("Entries precisely geocoded") +
  labs(caption = "ANR SoDUCo. Data: GeoHistoricalData")

Mapping entries: 1839

As we can see, some geocoded points are far from Paris, such as the entries on St-Martin street, which are located on the island.

Mapping entries: 1845

Entries only in the Ile-de-France region

# Read the GeoPackage file into `idf` as a simple feature object.
idf <- sf::st_read("iledefranceosm.gpkg")
## Reading layer `iledefranceosm' from data source 
##   `/data/user/g/jgravier/JE_soduco_2022_10_11/iledefranceosm.gpkg' 
##   using driver `GPKG'
## Simple feature collection with 1 feature and 113 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 1.446244 ymin: 48.12015 xmax: 3.559221 ymax: 49.24143
## Geodetic CRS:  WGS 84

Mapping entries: 1855

Mapping entries: 1855

Mapping entries: 1864

Mapping entries: 1875

Mapping entries: 1885

Mapping entries: 1893

Mapping entries: 1904